import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
dt = pd.read_csv("ggd-664.csv")
dt.head(10)
| Name | Platform | Year_of_Release | Genre | Publisher | NA_Sales | EU_Sales | JP_Sales | Other_Sales | Global_Sales | Critic_Score | Critic_Count | User_Score | User_Count | Developer | Rating | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Wii Sports | Wii | 2006.0 | Sports | Nintendo | 41.36 | 28.96 | 3.77 | 8.45 | 82.53 | 76.0 | 51.0 | 8 | 322.0 | Nintendo | E |
| 1 | Super Mario Bros. | NES | 1985.0 | Platform | Nintendo | 29.08 | 3.58 | 6.81 | 0.77 | 40.24 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | Mario Kart Wii | Wii | 2008.0 | Racing | Nintendo | 15.68 | 12.76 | 3.79 | 3.29 | 35.52 | 82.0 | 73.0 | 8.3 | 709.0 | Nintendo | E |
| 3 | Wii Sports Resort | Wii | 2009.0 | Sports | Nintendo | 15.61 | 10.93 | 3.28 | 2.95 | 32.77 | 80.0 | 73.0 | 8 | 192.0 | Nintendo | E |
| 4 | Pokemon Red/Pokemon Blue | GB | 1996.0 | Role-Playing | Nintendo | 11.27 | 8.89 | 10.22 | 1.00 | 31.37 | NaN | NaN | NaN | NaN | NaN | NaN |
| 5 | Tetris | GB | 1989.0 | Puzzle | Nintendo | 23.20 | 2.26 | 4.22 | 0.58 | 30.26 | NaN | NaN | NaN | NaN | NaN | NaN |
| 6 | New Super Mario Bros. | DS | 2006.0 | Platform | Nintendo | 11.28 | 9.14 | 6.50 | 2.88 | 29.80 | 89.0 | 65.0 | 8.5 | 431.0 | Nintendo | E |
| 7 | Wii Play | Wii | 2006.0 | Misc | Nintendo | 13.96 | 9.18 | 2.93 | 2.84 | 28.92 | 58.0 | 41.0 | 6.6 | 129.0 | Nintendo | E |
| 8 | New Super Mario Bros. Wii | Wii | 2009.0 | Platform | Nintendo | 14.44 | 6.94 | 4.70 | 2.24 | 28.32 | 87.0 | 80.0 | 8.4 | 594.0 | Nintendo | E |
| 9 | Duck Hunt | NES | 1984.0 | Shooter | Nintendo | 26.93 | 0.63 | 0.28 | 0.47 | 28.31 | NaN | NaN | NaN | NaN | NaN | NaN |
dt.dtypes
Name object Platform object Year_of_Release float64 Genre object Publisher object NA_Sales float64 EU_Sales float64 JP_Sales float64 Other_Sales float64 Global_Sales float64 Critic_Score float64 Critic_Count float64 User_Score object User_Count float64 Developer object Rating object dtype: object
dt.isna().sum()
Name 2 Platform 0 Year_of_Release 269 Genre 2 Publisher 54 NA_Sales 0 EU_Sales 0 JP_Sales 0 Other_Sales 0 Global_Sales 0 Critic_Score 8582 Critic_Count 8582 User_Score 6704 User_Count 9129 Developer 6623 Rating 6769 dtype: int64
#x=pd.DataFrame(dt[dt.Year_of_Release.isna()].groupby('Name')['Year_of_Release'].nunique())
#x
dt[dt.Name.isna()]
| Name | Platform | Year_of_Release | Genre | Publisher | NA_Sales | EU_Sales | JP_Sales | Other_Sales | Global_Sales | Critic_Score | Critic_Count | User_Score | User_Count | Developer | Rating | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 659 | NaN | GEN | 1993.0 | NaN | Acclaim Entertainment | 1.78 | 0.53 | 0.00 | 0.08 | 2.39 | NaN | NaN | NaN | NaN | NaN | NaN |
| 14246 | NaN | GEN | 1993.0 | NaN | Acclaim Entertainment | 0.00 | 0.00 | 0.03 | 0.00 | 0.03 | NaN | NaN | NaN | NaN | NaN | NaN |
dt.dropna(subset=['Name'], inplace=True)
duplicates = dt[dt.duplicated(['Name', 'Platform'])]
duplicates
| Name | Platform | Year_of_Release | Genre | Publisher | NA_Sales | EU_Sales | JP_Sales | Other_Sales | Global_Sales | Critic_Score | Critic_Count | User_Score | User_Count | Developer | Rating | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1591 | Need for Speed: Most Wanted | X360 | 2005.0 | Racing | Electronic Arts | 1.0 | 0.13 | 0.02 | 0.10 | 1.25 | 83.0 | 54.0 | 8.5 | 134.0 | EA Canada | T |
| 4127 | Sonic the Hedgehog | PS3 | NaN | Platform | NaN | 0.0 | 0.48 | 0.00 | 0.00 | 0.48 | 43.0 | 17.0 | 4.1 | 176.0 | Sonic Team | E10+ |
| 11716 | Need for Speed: Most Wanted | PC | 2012.0 | Racing | Electronic Arts | 0.0 | 0.06 | 0.00 | 0.02 | 0.08 | 82.0 | 19.0 | 8.5 | 525.0 | Black Box | T |
| 16233 | Madden NFL 13 | PS3 | 2012.0 | Sports | Electronic Arts | 0.0 | 0.01 | 0.00 | 0.00 | 0.01 | 83.0 | 22.0 | 5.5 | 101.0 | EA Tiburon | E |
dt.loc[dt['Name'] == 'Sonic the Hedgehog']
| Name | Platform | Year_of_Release | Genre | Publisher | NA_Sales | EU_Sales | JP_Sales | Other_Sales | Global_Sales | Critic_Score | Critic_Count | User_Score | User_Count | Developer | Rating | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 257 | Sonic the Hedgehog | GEN | 1991.0 | Platform | Sega | 3.03 | 0.91 | 0.26 | 0.13 | 4.34 | NaN | NaN | NaN | NaN | NaN | NaN |
| 1745 | Sonic the Hedgehog | PS3 | 2006.0 | Platform | Sega | 0.41 | 0.06 | 0.04 | 0.66 | 1.16 | 43.0 | 17.0 | 4.1 | 176.0 | Sonic Team | E10+ |
| 1996 | Sonic the Hedgehog | X360 | 2006.0 | Platform | Sega | 0.44 | 0.48 | 0.00 | 0.11 | 1.04 | 46.0 | 38.0 | 4.4 | 455.0 | Sega | E10+ |
| 4127 | Sonic the Hedgehog | PS3 | NaN | Platform | NaN | 0.00 | 0.48 | 0.00 | 0.00 | 0.48 | 43.0 | 17.0 | 4.1 | 176.0 | Sonic Team | E10+ |
def adding_sales(dt, row1, row2):
for x in list(range(5,10)):
dt.iloc[row1, x] = dt.iloc[row1, x] + dt.iloc[row2, x]
updated_df = dt.drop(index=row2)
return updated_df
dt = adding_sales(dt,1745, 4127)
dt.query('Name == "Sonic the Hedgehog"')
| Name | Platform | Year_of_Release | Genre | Publisher | NA_Sales | EU_Sales | JP_Sales | Other_Sales | Global_Sales | Critic_Score | Critic_Count | User_Score | User_Count | Developer | Rating | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 257 | Sonic the Hedgehog | GEN | 1991.0 | Platform | Sega | 3.03 | 0.91 | 0.26 | 0.13 | 4.34 | NaN | NaN | NaN | NaN | NaN | NaN |
| 1745 | Sonic the Hedgehog | PS3 | 2006.0 | Platform | Sega | 0.41 | 0.06 | 0.04 | 0.66 | 1.16 | 43.0 | 17.0 | 4.1 | 176.0 | Sonic Team | E10+ |
| 1996 | Sonic the Hedgehog | X360 | 2006.0 | Platform | Sega | 0.44 | 0.48 | 0.00 | 0.11 | 1.04 | 46.0 | 38.0 | 4.4 | 455.0 | Sega | E10+ |
dt.loc[dt['Name'] == 'Madden NFL 13']
| Name | Platform | Year_of_Release | Genre | Publisher | NA_Sales | EU_Sales | JP_Sales | Other_Sales | Global_Sales | Critic_Score | Critic_Count | User_Score | User_Count | Developer | Rating | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 507 | Madden NFL 13 | X360 | 2012.0 | Sports | Electronic Arts | 2.53 | 0.15 | 0.0 | 0.17 | 2.86 | 81.0 | 36.0 | 5.8 | 179.0 | EA Tiburon | E |
| 604 | Madden NFL 13 | PS3 | 2012.0 | Sports | Electronic Arts | 2.11 | 0.22 | 0.0 | 0.23 | 2.56 | 83.0 | 22.0 | 5.5 | 101.0 | EA Tiburon | E |
| 3986 | Madden NFL 13 | Wii | 2012.0 | Sports | Electronic Arts | 0.47 | 0.00 | 0.0 | 0.03 | 0.50 | NaN | NaN | 7.3 | 4.0 | EA Tiburon | E |
| 5887 | Madden NFL 13 | PSV | 2012.0 | Sports | Electronic Arts | 0.28 | 0.00 | 0.0 | 0.02 | 0.30 | 63.0 | 6.0 | 7.3 | 38.0 | EA Tiburon | E |
| 7067 | Madden NFL 13 | WiiU | 2012.0 | Sports | Electronic Arts | 0.21 | 0.00 | 0.0 | 0.02 | 0.23 | 75.0 | 9.0 | 6.7 | 30.0 | EA Tiburon | E |
| 16233 | Madden NFL 13 | PS3 | 2012.0 | Sports | Electronic Arts | 0.00 | 0.01 | 0.0 | 0.00 | 0.01 | 83.0 | 22.0 | 5.5 | 101.0 | EA Tiburon | E |
dt = adding_sales(dt,604, 16233)
dt.query('Name == "Madden NFL 13"')
| Name | Platform | Year_of_Release | Genre | Publisher | NA_Sales | EU_Sales | JP_Sales | Other_Sales | Global_Sales | Critic_Score | Critic_Count | User_Score | User_Count | Developer | Rating | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 507 | Madden NFL 13 | X360 | 2012.0 | Sports | Electronic Arts | 2.53 | 0.15 | 0.00 | 0.17 | 2.86 | 81.0 | 36.0 | 5.8 | 179.0 | EA Tiburon | E |
| 604 | Madden NFL 13 | PS3 | 2012.0 | Sports | Electronic Arts | 2.11 | 0.22 | 0.01 | 0.23 | 2.57 | 83.0 | 22.0 | 5.5 | 101.0 | EA Tiburon | E |
| 3986 | Madden NFL 13 | Wii | 2012.0 | Sports | Electronic Arts | 0.47 | 0.00 | 0.00 | 0.03 | 0.50 | NaN | NaN | 7.3 | 4.0 | EA Tiburon | E |
| 5887 | Madden NFL 13 | PSV | 2012.0 | Sports | Electronic Arts | 0.28 | 0.00 | 0.00 | 0.02 | 0.30 | 63.0 | 6.0 | 7.3 | 38.0 | EA Tiburon | E |
| 7067 | Madden NFL 13 | WiiU | 2012.0 | Sports | Electronic Arts | 0.21 | 0.00 | 0.00 | 0.02 | 0.23 | 75.0 | 9.0 | 6.7 | 30.0 | EA Tiburon | E |
dt.loc[dt['Name'] == 'Need for Speed: Most Wanted']
| Name | Platform | Year_of_Release | Genre | Publisher | NA_Sales | EU_Sales | JP_Sales | Other_Sales | Global_Sales | Critic_Score | Critic_Count | User_Score | User_Count | Developer | Rating | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 253 | Need for Speed: Most Wanted | PS2 | 2005.0 | Racing | Electronic Arts | 2.03 | 1.79 | 0.08 | 0.47 | 4.37 | 82.0 | 36.0 | 9.1 | 137.0 | EA Canada | T |
| 523 | Need for Speed: Most Wanted | PS3 | 2012.0 | Racing | Electronic Arts | 0.71 | 1.46 | 0.06 | 0.58 | 2.81 | NaN | NaN | NaN | NaN | NaN | NaN |
| 1190 | Need for Speed: Most Wanted | X360 | 2012.0 | Racing | Electronic Arts | 0.62 | 0.78 | 0.01 | 0.15 | 1.56 | 83.0 | 54.0 | 8.5 | 134.0 | EA Canada | T |
| 1591 | Need for Speed: Most Wanted | X360 | 2005.0 | Racing | Electronic Arts | 1.00 | 0.13 | 0.02 | 0.10 | 1.25 | 83.0 | 54.0 | 8.5 | 134.0 | EA Canada | T |
| 1998 | Need for Speed: Most Wanted | XB | 2005.0 | Racing | Electronic Arts | 0.53 | 0.46 | 0.00 | 0.05 | 1.04 | 83.0 | 32.0 | 8.8 | 29.0 | EA Canada | T |
| 2048 | Need for Speed: Most Wanted | PSV | 2012.0 | Racing | Electronic Arts | 0.33 | 0.45 | 0.01 | 0.22 | 1.01 | NaN | NaN | NaN | NaN | NaN | NaN |
| 3581 | Need for Speed: Most Wanted | GC | 2005.0 | Racing | Electronic Arts | 0.43 | 0.11 | 0.00 | 0.02 | 0.56 | 80.0 | 18.0 | 9.1 | 22.0 | EA Canada | T |
| 5973 | Need for Speed: Most Wanted | PC | 2005.0 | Racing | Electronic Arts | 0.02 | 0.23 | 0.00 | 0.04 | 0.29 | 82.0 | 19.0 | 8.5 | 525.0 | Black Box | T |
| 6274 | Need for Speed: Most Wanted | WiiU | 2013.0 | Racing | Electronic Arts | 0.13 | 0.12 | 0.00 | 0.02 | 0.27 | NaN | NaN | NaN | NaN | NaN | NaN |
| 6411 | Need for Speed: Most Wanted | DS | 2005.0 | Racing | Electronic Arts | 0.24 | 0.01 | 0.00 | 0.02 | 0.27 | 45.0 | 4.0 | 6.1 | 22.0 | EA Canada | E |
| 6474 | Need for Speed: Most Wanted | GBA | 2005.0 | Racing | Electronic Arts | 0.19 | 0.07 | 0.00 | 0.00 | 0.26 | NaN | NaN | 8.3 | 14.0 | EA Canada | E |
| 11716 | Need for Speed: Most Wanted | PC | 2012.0 | Racing | Electronic Arts | 0.00 | 0.06 | 0.00 | 0.02 | 0.08 | 82.0 | 19.0 | 8.5 | 525.0 | Black Box | T |
#Xbox 360 Platform
print(dt.loc[1190,'Name'])
print(dt.loc[1591,'Name'])
#PC Platform
print(dt.loc[5973,'Name'])
print(dt.loc[11716,'Name'])
Need for Speed: Most Wanted Need for Speed: Most Wanted Need for Speed: Most Wanted Need for Speed: Most Wanted
dt.loc[1190,'Name'] = 'Need for Speed: Most Wanted 2005'
dt.loc[1591,'Name'] = 'Need for Speed: Most Wanted 2012'
dt.loc[5973,'Name'] = 'Need for Speed: Most Wanted 2005'
dt.loc[11716,'Name'] = 'Need for Speed: Most Wanted 2012'
dt.loc[[1190, 1591,5973,11716],'Name']
1190 Need for Speed: Most Wanted 2005 1591 Need for Speed: Most Wanted 2012 5973 Need for Speed: Most Wanted 2005 11716 Need for Speed: Most Wanted 2012 Name: Name, dtype: object
import pandas as pd
import numpy as np
def missing_score(dt):
dt['User_Score'] = pd.to_numeric(dt['User_Score'], errors='coerce')
new_df = pd.DataFrame(columns=dt.columns)
# Group the DataFrame by game name
grouped = dt.groupby('Name')
# Loop through each group
for name, group in grouped:
# Get the mean scores for this game
user_mean = group['User_Score'].mean()
critic_mean = group['Critic_Score'].mean()
user_mean1 = group['User_Count'].mean()
critic_mean1 = group['Critic_Count'].mean()
#print(f"Game: {name}, User Mean Score: {user_mean}, Critic Mean Score: {critic_mean}")
# Replace missing user scores with the mean
group.loc[group['User_Score'].isnull(), 'User_Score'] = user_mean
# Replace missing critic scores with the mean
group.loc[group['Critic_Score'].isnull(), 'Critic_Score'] = critic_mean
group.loc[group['User_Count'].isnull(), 'User_Count'] = user_mean1
# Replace missing critic scores with the mean
group.loc[group['Critic_Count'].isnull(), 'Critic_Count'] = critic_mean1
new_df = pd.concat([new_df, group])
new_df = new_df.reset_index(drop=True)
return new_df
dt = missing_score(dt)
dt.isna().sum()
Name 0 Platform 0 Year_of_Release 268 Genre 0 Publisher 53 NA_Sales 0 EU_Sales 0 JP_Sales 0 Other_Sales 0 Global_Sales 0 Critic_Score 7619 Critic_Count 7619 User_Score 8028 User_Count 8028 Developer 6621 Rating 6767 dtype: int64
dt.dropna(subset=['Year_of_Release'], inplace=True)
dt.dropna(subset=['Publisher'], inplace=True)
dt.dropna(subset=['Critic_Score'], inplace=True)
dt.dropna(subset=['User_Score'], inplace=True)
dt.dropna(subset=['User_Count'], inplace=True)
dt.dropna(subset=['Critic_Count'], inplace=True)
dt.dropna(subset=['Developer'], inplace=True)
dt.dropna(subset=['Rating'], inplace=True)
dt.isna().sum()
Name 0 Platform 0 Year_of_Release 0 Genre 0 Publisher 0 NA_Sales 0 EU_Sales 0 JP_Sales 0 Other_Sales 0 Global_Sales 0 Critic_Score 0 Critic_Count 0 User_Score 0 User_Count 0 Developer 0 Rating 0 dtype: int64
dt['Year_of_Release'] = dt['Year_of_Release'].astype('int')
dt['User_Score'] = dt['User_Score'].astype('float')
dt['NA_Sales'] = dt['NA_Sales'].astype('float')
dt['EU_Sales'] = dt['EU_Sales'].astype('float')
dt['JP_Sales'] = dt['JP_Sales'].astype('float')
dt['Other_Sales'] = dt['Other_Sales'].astype('float')
dt['Global_Sales'] = dt['Global_Sales'].astype('float')
dt['Critic_Score'] = dt['Critic_Score'].astype('float')
dt['Critic_Count'] = dt['Critic_Count'].astype('int')
#dt[['Genre','Rating']] = dt[['Genre','Rating']].astype('category')
dt['User_Count'] = dt['User_Count'].astype('int')
dt.dtypes
Name object Platform object Year_of_Release int32 Genre object Publisher object NA_Sales float64 EU_Sales float64 JP_Sales float64 Other_Sales float64 Global_Sales float64 Critic_Score float64 Critic_Count int32 User_Score float64 User_Count int32 Developer object Rating object dtype: object
dt['User_Score'] = dt['User_Score']*10 #User score was from 1-10 and critic score was 1-100 so we normalized the scale.
df = dt
corr = dt.corr()
fig, ax = plt.subplots(figsize=(11,10))
sns.heatmap(corr, annot=True)
plt.show()
dt.dtypes
features1 = dt[['Critic_Score','User_Score','NA_Sales']]
class1 = dt[['Global_Sales']]
features_train, features_test, class_train, class_test = train_test_split(features1, class1, test_size=0.15, random_state=0)
class_test.index
Int64Index([ 6650, 12766, 7006, 11993, 15908, 11195, 11562, 356, 4045,
5766,
...
4086, 1000, 615, 13894, 13604, 2937, 14063, 15608, 9378,
4184],
dtype='int64', length=1172)
names_test = dt.loc[class_test.index, 'Name'].values
features_train= features_train.to_numpy()
features_test= features_test.to_numpy()
class_train= class_train.to_numpy()
class_test= class_test.to_numpy()
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
model = LinearRegression()
#model = SVR(kernel='linear')
model.fit(features_train, class_train.ravel())
y_pred = model.predict(features_test)
accuracy = r2_score(class_test, y_pred)
print(accuracy)
0.8805100296458697
#LR
model = LinearRegression()
model.fit(features_train, class_train.ravel())
y_pred = model.predict(features_test)
accuracy = r2_score(class_test, y_pred)
rmse = np.sqrt(mean_squared_error(class_test, y_pred))
mae = mean_absolute_error(class_test, y_pred)
print(accuracy)
print(rmse)
print(mae)
#SVR
model1 = SVR(kernel='linear')
model1.fit(features_train, class_train.ravel())
y_pred1 = model1.predict(features_test)
accuracy1 = r2_score(class_test, y_pred1)
rmse1 = np.sqrt(mean_squared_error(class_test, y_pred1))
mae1 = mean_absolute_error(class_test, y_pred1)
print(accuracy1)
print(rmse1)
print(mae1)
#RF
model2 = RandomForestRegressor(n_estimators=100)
model2.fit(features_train, class_train.ravel())
y_pred2 = model2.predict(features_test)
accuracy2 = r2_score(class_test, y_pred2)
rmse2 = np.sqrt(mean_squared_error(class_test, y_pred2))
mae2 = mean_absolute_error(class_test, y_pred2)
print(accuracy2)
print(rmse2)
print(mae2)
#DT
from sklearn.tree import DecisionTreeRegressor
# Initialize the model
model3 = DecisionTreeRegressor()
model3.fit(features_train, class_train.ravel())
y_pred3 = model3.predict(features_test)
accuracy3 = r2_score(class_test, y_pred3)
rmse3 = np.sqrt(mean_squared_error(class_test, y_pred3))
mae3 = mean_absolute_error(class_test, y_pred3)
print(accuracy3)
print(rmse3)
print(mae3)
lr = model
svr = model1
rf = model2
dtx=model3
0.8805100296458697 0.5952654121797615 0.22208546809051266 0.8663012071293303 0.6296637116330522 0.21095203617585212 0.867757407324701 0.626225285703206 0.23651090907551878 0.7377533739998539 0.881860472369909 0.318764220705347
import seaborn as sns
import matplotlib.pyplot as plt
models = ['Linear Regression', 'SVR', 'Random Forest','Decision Tree']
r_squared_values = [accuracy, accuracy1, accuracy2, accuracy3]
plt.figure(figsize=(8, 6))
ax = sns.pointplot(x=models, y=r_squared_values)
ax.set(xlabel='Models', ylabel='R-squared', title='Comparison of Accuracy(R-square)')
plt.ylim(0.65, 1.0)
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
models = ['Linear Regression', 'SVR', 'Random Forest','Decision Tree']
r_squared_values = [rmse, rmse1, rmse2, rmse3]
plt.figure(figsize=(8, 6))
ax = sns.pointplot(x=models, y=r_squared_values)
ax.set(xlabel='Models', ylabel='RMSE', title='Comparison of Root Mean Square Error')
plt.ylim(0.5, 1.0)
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
models = ['Linear Regression', 'SVR', 'Random Forest','Decision Tree']
r_squared_values = [mae, mae1, mae2, mae3]
plt.figure(figsize=(8, 6))
ax = sns.pointplot(x=models, y=r_squared_values)
ax.set(xlabel='Models', ylabel='RMSE', title='Comparison of Mean Absolute Error')
plt.ylim(0.15, 0.5)
plt.show()
## RUN SVR ONLY IF NECESSARY (TAKES TOO LONG TO EXECUTE)
game_index = np.random.choice(range(len(features_test)))
game_features = features_test[game_index].reshape(1, -1)
game_actual_sales = class_test[game_index]
# Predict the global sales of the game using the three models
game_lr_predicted_sales = lr.predict(game_features)
game_svr_predicted_sales = svr.predict(game_features)
game_rf_predicted_sales = rf.predict(game_features)
game_dt_predicted_sales = dtx.predict(game_features)
plt.figure(figsize=(8,6))
# Plot the predicted global sales vs. actual global sales for the game
plt.scatter(game_actual_sales, game_lr_predicted_sales, label='Linear Regression')
plt.scatter(game_actual_sales, game_svr_predicted_sales, label='Support Vector Regression')
plt.scatter(game_actual_sales, game_rf_predicted_sales, label='Random Forest Regression')
plt.scatter(game_actual_sales, game_dt_predicted_sales, label='Decision Tree Regression')
plt.xlabel('Actual Global Sales')
plt.ylabel('Predicted Global Sales')
plt.title('Comparison of Regression Models for ' + names_test[game_index])
plt.legend()
plt.show()
game_actual_sales
array([0.15])
game_lr_predicted_sales
array([0.17183536])
game_svr_predicted_sales
array([0.18291265])
game_dt_predicted_sales
array([0.09])
game_rf_predicted_sales
array([0.13])
from sklearn.preprocessing import LabelEncoder
#TEST CODE#
# Encode the Genre and Publisher columns using LabelEncoder
le_g = LabelEncoder()
le_p = LabelEncoder()
dt['Genre'] = le_g.fit_transform(dt['Genre'])
dt['Publisher'] = le_p.fit_transform(dt['Publisher'])
# Split the dataset into training and testing sets
X = dt[['Year_of_Release','Genre','Publisher','NA_Sales']]
y = dt['Global_Sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)
# Train a linear regression model on the training set
model = LinearRegression()
model.fit(X_train, y_train)
#model = RandomForestRegressor(n_estimators=50, random_state=0)
#model.fit(X_train, y_train.ravel())
#y_pred1 = model1.predict(features_test)
# Evaluate the model on the testing set
y_pred = model.predict(X_test)
dt['Genre'] = le_g.inverse_transform(dt['Genre'].astype(int))
dt['Publisher'] = le_p.inverse_transform(dt['Publisher'].astype(int))
dt
| Name | Platform | Year_of_Release | Genre | Publisher | NA_Sales | EU_Sales | JP_Sales | Other_Sales | Global_Sales | Critic_Score | Critic_Count | User_Score | User_Count | Developer | Rating | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5 | Tales of Xillia 2 | PS3 | 2012 | Role-Playing | Namco Bandai Games | 0.20 | 0.12 | 0.45 | 0.07 | 0.84 | 71.0 | 59 | 79.0 | 216 | Bandai Namco Games | T |
| 11 | .hack//Infection Part 1 | PS2 | 2002 | Role-Playing | Atari | 0.49 | 0.38 | 0.26 | 0.13 | 1.27 | 75.0 | 35 | 85.0 | 60 | CyberConnect2 | T |
| 13 | .hack//Mutation Part 2 | PS2 | 2002 | Role-Playing | Atari | 0.23 | 0.18 | 0.20 | 0.06 | 0.68 | 76.0 | 24 | 89.0 | 81 | CyberConnect2 | T |
| 14 | .hack//Outbreak Part 3 | PS2 | 2002 | Role-Playing | Atari | 0.14 | 0.11 | 0.17 | 0.04 | 0.46 | 70.0 | 23 | 87.0 | 19 | CyberConnect2 | T |
| 17 | 007 Racing | PS | 2000 | Racing | Electronic Arts | 0.30 | 0.20 | 0.00 | 0.03 | 0.53 | 51.0 | 16 | 46.0 | 14 | Eutechnyx | T |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16702 | pro evolution soccer 2011 | X360 | 2010 | Sports | Konami Digital Entertainment | 0.09 | 0.44 | 0.00 | 0.07 | 0.61 | 79.0 | 43 | 59.0 | 33 | Konami | E |
| 16703 | pro evolution soccer 2011 | PS2 | 2010 | Sports | Konami Digital Entertainment | 0.04 | 0.21 | 0.05 | 0.11 | 0.41 | 77.4 | 22 | 67.0 | 7 | Konami | E |
| 16704 | pro evolution soccer 2011 | Wii | 2010 | Sports | Konami Digital Entertainment | 0.07 | 0.10 | 0.03 | 0.02 | 0.22 | 78.0 | 9 | 54.0 | 7 | Konami | E |
| 16711 | uDraw Studio: Instant Artist | Wii | 2011 | Misc | THQ | 0.06 | 0.09 | 0.00 | 0.02 | 0.17 | 54.0 | 5 | 57.0 | 6 | THQ | E |
| 16712 | uDraw Studio: Instant Artist | X360 | 2011 | Misc | THQ | 0.01 | 0.01 | 0.00 | 0.00 | 0.02 | 54.0 | 5 | 57.0 | 6 | THQ | E |
7810 rows × 16 columns
# Get the top publisher for each genre based on NA_Sales and Global_Sales
top_publishers = dt.groupby('Genre').agg({
'Publisher': lambda x: x.value_counts().index[0],
'NA_Sales': 'mean',
'Global_Sales': 'mean'
}).reset_index()
top_publishers
| Genre | Publisher | NA_Sales | Global_Sales | |
|---|---|---|---|---|
| 0 | Action | Activision | 0.338090 | 0.682804 |
| 1 | Adventure | Ubisoft | 0.149965 | 0.311150 |
| 2 | Fighting | Namco Bandai Games | 0.338454 | 0.620749 |
| 3 | Misc | Ubisoft | 0.495267 | 0.922305 |
| 4 | Platform | THQ | 0.424417 | 0.820961 |
| 5 | Puzzle | Nintendo | 0.274385 | 0.636462 |
| 6 | Racing | Electronic Arts | 0.349468 | 0.729911 |
| 7 | Role-Playing | Square Enix | 0.303803 | 0.696430 |
| 8 | Shooter | Electronic Arts | 0.500686 | 0.912144 |
| 9 | Simulation | Electronic Arts | 0.305210 | 0.670744 |
| 10 | Sports | Electronic Arts | 0.419583 | 0.768707 |
| 11 | Strategy | THQ | 0.121828 | 0.254122 |
# Create new rows for each genre-publisher combination with Year as 2017
new_games = []
for _, row in top_publishers.iterrows():
new_game = {'Year_of_Release': 2017, 'Genre': row['Genre'], 'Publisher': row['Publisher'], 'NA_Sales': row['NA_Sales'], 'Global_Sales': row['Global_Sales']}
new_games.append(new_game)
new_games = pd.DataFrame(new_games)
new_games
| Year_of_Release | Genre | Publisher | NA_Sales | Global_Sales | |
|---|---|---|---|---|---|
| 0 | 2017 | Action | Activision | 0.338090 | 0.682804 |
| 1 | 2017 | Adventure | Ubisoft | 0.149965 | 0.311150 |
| 2 | 2017 | Fighting | Namco Bandai Games | 0.338454 | 0.620749 |
| 3 | 2017 | Misc | Ubisoft | 0.495267 | 0.922305 |
| 4 | 2017 | Platform | THQ | 0.424417 | 0.820961 |
| 5 | 2017 | Puzzle | Nintendo | 0.274385 | 0.636462 |
| 6 | 2017 | Racing | Electronic Arts | 0.349468 | 0.729911 |
| 7 | 2017 | Role-Playing | Square Enix | 0.303803 | 0.696430 |
| 8 | 2017 | Shooter | Electronic Arts | 0.500686 | 0.912144 |
| 9 | 2017 | Simulation | Electronic Arts | 0.305210 | 0.670744 |
| 10 | 2017 | Sports | Electronic Arts | 0.419583 | 0.768707 |
| 11 | 2017 | Strategy | THQ | 0.121828 | 0.254122 |
# Fit the label encoder on the entire dataset
genre_encoder = LabelEncoder()
publisher_encoder = LabelEncoder()
genre_encoder.fit(new_games['Genre'])
publisher_encoder.fit(new_games['Publisher'])
LabelEncoder()
# Encode the genre and publisher columns in the new_games dataset
new_games['Genre'] = genre_encoder.transform(new_games['Genre'])
new_games['Publisher'] = publisher_encoder.transform(new_games['Publisher'])
# Select the features to use for prediction
features = ['Year_of_Release', 'Genre', 'Publisher', 'NA_Sales']
# Use the trained model to predict the expected global sales for each new game
expected_sales = model.predict(new_games[features])
# Add the predicted sales to the new games dataframe
new_games['Expected_Global_Sales'] = expected_sales
new_games
| Year_of_Release | Genre | Publisher | NA_Sales | Global_Sales | Expected_Global_Sales | |
|---|---|---|---|---|---|---|
| 0 | 2017 | 0 | 0 | 0.338090 | 0.682804 | 0.728906 |
| 1 | 2017 | 1 | 6 | 0.149965 | 0.311150 | 0.365073 |
| 2 | 2017 | 2 | 2 | 0.338454 | 0.620749 | 0.728611 |
| 3 | 2017 | 3 | 6 | 0.495267 | 0.922305 | 1.032494 |
| 4 | 2017 | 4 | 5 | 0.424417 | 0.820961 | 0.894333 |
| 5 | 2017 | 5 | 3 | 0.274385 | 0.636462 | 0.602583 |
| 6 | 2017 | 6 | 1 | 0.349468 | 0.729911 | 0.746881 |
| 7 | 2017 | 7 | 4 | 0.303803 | 0.696430 | 0.658353 |
| 8 | 2017 | 8 | 1 | 0.500686 | 0.912144 | 1.038362 |
| 9 | 2017 | 9 | 1 | 0.305210 | 0.670744 | 0.659016 |
| 10 | 2017 | 10 | 1 | 0.419583 | 0.768707 | 0.879842 |
| 11 | 2017 | 11 | 5 | 0.121828 | 0.254122 | 0.303232 |
# Transform the Genre and Publisher columns using the same LabelEncoder objects
new_games['Genre'] = genre_encoder.inverse_transform(new_games['Genre'].astype(int))
new_games['Publisher'] = publisher_encoder.inverse_transform(new_games['Publisher'].astype(int))
# Show the final new_games dataframe
print(new_games)
Year_of_Release Genre Publisher NA_Sales Global_Sales \
0 2017 Action Activision 0.338090 0.682804
1 2017 Adventure Ubisoft 0.149965 0.311150
2 2017 Fighting Namco Bandai Games 0.338454 0.620749
3 2017 Misc Ubisoft 0.495267 0.922305
4 2017 Platform THQ 0.424417 0.820961
5 2017 Puzzle Nintendo 0.274385 0.636462
6 2017 Racing Electronic Arts 0.349468 0.729911
7 2017 Role-Playing Square Enix 0.303803 0.696430
8 2017 Shooter Electronic Arts 0.500686 0.912144
9 2017 Simulation Electronic Arts 0.305210 0.670744
10 2017 Sports Electronic Arts 0.419583 0.768707
11 2017 Strategy THQ 0.121828 0.254122
Expected_Global_Sales
0 0.728906
1 0.365073
2 0.728611
3 1.032494
4 0.894333
5 0.602583
6 0.746881
7 0.658353
8 1.038362
9 0.659016
10 0.879842
11 0.303232
new_games
| Year_of_Release | Genre | Publisher | NA_Sales | Global_Sales | Expected_Global_Sales | |
|---|---|---|---|---|---|---|
| 0 | 2017 | Action | Activision | 0.338090 | 0.682804 | 0.728906 |
| 1 | 2017 | Adventure | Ubisoft | 0.149965 | 0.311150 | 0.365073 |
| 2 | 2017 | Fighting | Namco Bandai Games | 0.338454 | 0.620749 | 0.728611 |
| 3 | 2017 | Misc | Ubisoft | 0.495267 | 0.922305 | 1.032494 |
| 4 | 2017 | Platform | THQ | 0.424417 | 0.820961 | 0.894333 |
| 5 | 2017 | Puzzle | Nintendo | 0.274385 | 0.636462 | 0.602583 |
| 6 | 2017 | Racing | Electronic Arts | 0.349468 | 0.729911 | 0.746881 |
| 7 | 2017 | Role-Playing | Square Enix | 0.303803 | 0.696430 | 0.658353 |
| 8 | 2017 | Shooter | Electronic Arts | 0.500686 | 0.912144 | 1.038362 |
| 9 | 2017 | Simulation | Electronic Arts | 0.305210 | 0.670744 | 0.659016 |
| 10 | 2017 | Sports | Electronic Arts | 0.419583 | 0.768707 | 0.879842 |
| 11 | 2017 | Strategy | THQ | 0.121828 | 0.254122 | 0.303232 |
import plotly.express as px
plt.figure(figsize=(16,9))
# assuming your dataframe is called 'df'
fig = px.bar(new_games, x='Genre', y='Global_Sales', color='Publisher', hover_data=['Publisher'])
# update hover information to only display publisher
#fig.update_traces(hovertemplate='<br>'.join(['Publisher: %{customdata}']))
fig.show()
<Figure size 1600x900 with 0 Axes>
### MINING QUESTION HOW WILL THE GAME SALES BE IN YEAR 2017 BASED ON GENRE, (DATA IS UPTO YEAR 2016)
#sns.set(style="darkgrid")
df = new_games
# Set the width of each bar and the positions of the bars on the x-axis
bar_width = 0.35
x_pos = np.arange(len(df['Genre']))
# Create a figure and axis objects with a larger size
fig, ax = plt.subplots(figsize=(16, 9))
# Create bars for global sales and expected global sales for each publisher
global_sales_bars = ax.bar(x_pos - bar_width/2, df['Global_Sales'], bar_width, label='Global Sales')
expected_global_sales_bars = ax.bar(x_pos + bar_width/2, df['Expected_Global_Sales'], bar_width, label='Expected Global Sales')
# Add labels, title, and legend
ax.set_ylabel('Sales')
#ax.set_title('Global Sales by Publisher')
ax.set_xticks(x_pos)
ax.set_xticklabels(df['Genre'])
ax.legend()
# Show the plot
plt.show()
import plotly.graph_objects as go
df = new_games
colors = [sns.color_palette()[0], sns.color_palette()[1]]
palette = sns.color_palette(['#0072B2', '#D55E00']).as_hex()
# Create a figure with the bar chart
fig = px.bar(df, x='Genre', y=['Global_Sales', 'Expected_Global_Sales'], barmode= 'group',
hover_data=['Publisher'],color_discrete_sequence=palette)
#fig.update_traces(marker=['blue', 'orange'])
# Customize the hover text to show the top publisher and sales type
fig.update_traces(hovertemplate='Publisher: %{customdata[0]}<br>'
)
# Set the figure layout and title
fig.update_layout(
title='Global Sales by Genre',
xaxis_title='Genre',
yaxis_title='Sales'
)
# Show the plot
fig.show()
##Q 2
from sklearn.svm import SVR
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
X = dt[['Year_of_Release','Critic_Score','User_Score','NA_Sales']]
y = dt[['Global_Sales']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)
X_train= X_train.to_numpy()
X_test= X_test.to_numpy()
y_train= y_train.to_numpy()
y_test= y_test.to_numpy()
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
#model = SVR(kernel='linear')
# Train the model on the training set
#model.fit(X_train, y_train)
# Make predictions on the testing set
#y_pred = model.predict(X_test)
# Evaluate the performance of the model
#mse = mean_squared_error(y_test, y_pred)
#print("Mean squared error: ", mse)
from sklearn.ensemble import RandomForestRegressor
#model = RandomForestRegressor(n_estimators=100, random_state=42)
# Train the model on the training set
#model.fit(X_train, y_train)
# Make predictions on the testing set
#y_pred = model.predict(X_test)
accuracy = r2_score(y_test, y_pred)
print(accuracy)
0.8811877796971053
Test_Data = pd.DataFrame(X_test)
Test_Data[0] = Test_Data[0].astype(int)
Test_Data
| 0 | 1 | 2 | 3 | |
|---|---|---|---|---|
| 0 | 2013 | 71.0 | 70.0 | 0.30 |
| 1 | 2001 | 89.0 | 86.0 | 0.00 |
| 2 | 2002 | 81.0 | 89.0 | 1.22 |
| 3 | 2008 | 63.0 | 79.0 | 0.09 |
| 4 | 2006 | 55.0 | 74.0 | 0.06 |
| ... | ... | ... | ... | ... |
| 1167 | 2013 | 60.0 | 72.0 | 0.38 |
| 1168 | 2004 | 75.0 | 88.0 | 0.08 |
| 1169 | 2008 | 74.0 | 68.0 | 0.22 |
| 1170 | 2008 | 67.0 | 84.0 | 0.11 |
| 1171 | 2015 | 65.0 | 63.0 | 0.09 |
1172 rows × 4 columns
dff = Test_Data.sample(n=5)#,replace=False)
dff
| 0 | 1 | 2 | 3 | |
|---|---|---|---|---|
| 497 | 2008 | 47.0 | 70.0 | 0.49 |
| 726 | 2003 | 76.0 | 74.0 | 0.01 |
| 416 | 2012 | 88.0 | 83.0 | 0.48 |
| 140 | 2011 | 56.0 | 57.0 | 0.02 |
| 10 | 2010 | 63.0 | 72.0 | 0.23 |
import random import pandas as pd
critic_scores = dff[0] na_sales = dff[2]
original_user_scores = dff[1]
random_user_scores_sets = [] for i in range(5): random_user_scores = random.sample(range(101), 5) random_user_scores_set = [random_userscores for in range(5)] random_user_scores_sets.append(random_user_scores_set)
user_scores_sets = [list(critic_scores) + random_user_scores_set[i] + list(na_sales) for i in range(5)]
df_list = [] for i in range(5): random_user_scores_set = random_user_scores_sets[i] user_scores = list(original_user_scores) + random_user_scores_set df = pd.DataFrame({'Critic score': critic_scores5, 'User score': user_scores, 'NA Sales': na_sales5}) df_list.append(df)
import pandas as pd import random
df_test = dff df_test.columns = ['Critic score', 'User score', 'NA Sales']
critic_scores = df_test['Critic score'] na_sales = df_test['NA Sales']
random_user_scores_sets = [random.sample(range(101), 5)] * len(df_test['User score'].unique())
user_scores = [] for score in df_test['User score'].unique(): user_scores += [score] + random_user_scores_sets
critic_scores = list(critic_scores) len(df_test['User score'].unique()) na_sales = list(na_sales) len(df_test['User score'].unique())
df = pd.DataFrame({'Critic score': critic_scores 5, 'User score': user_scores 5, 'NA Sales': na_sales * 5})
import pandas as pd
df =dff df.columns = ['Year_of_Release','Critic score', 'User score', 'NA Sales']
critic_scores = df_test['Critic score'] na_sales = df_test['NA Sales'] year = df_test['Year_of_Release'] user = df_test['User score']
def generate_consecutive_years(year): return list(range(year + 1, year + 6))
consecutive_years_sets = [] for year in df['Year_of_Release']: consecutive_years_sets += [generate_consecutive_years(year)] * 5
consecutive_years = [] for year in df['Year_of_Release'].unique(): consecutive_years += [year] * 5 + consecutive_years_sets.pop(0)
critic_scores = np.repeat(df_test['Critic score'], 6).tolist() na_sales = np.repeat(df_test['NA Sales'], 6).tolist() user = np.repeat(df_test['User score'],6).tolist()
df['Consecutive Years'] = consecutive_years
import pandas as pd import random
df_test = dff df_test.columns = ['Year_of_Release','Critic score', 'User score', 'NA Sales']
critic_scores = df_test['Critic score'] na_sales = df_test['NA Sales'] year = df_test['Year_of_Release'] user = df_test['User score'] value_set = [2016,2017,2018,2019,2020] random_user_scores_sets = [valueset[:] for in range(5)]
user_scores = [] for score in df_test['Year_of_Release']: user_scores += [score] user_scores += random_user_scores_sets.pop(0)
critic_scores = np.repeat(df_test['Critic score'], 6).tolist() na_sales = np.repeat(df_test['NA Sales'], 6).tolist() user = np.repeat(df_test['User score'],6).tolist()
dfx = pd.DataFrame({'Year_of_Release': user_scores, 'Critic score': critic_scores, 'User score': user, 'NA Sales': na_sales})
#NEW TEST FOR CONSECUtive years
#dff[0] = dff[0].astype(int)
import pandas as pd
import numpy as np
# Read in the original dataframe with Critic score, User score, and NA Sales
df_test = dff
df_test.columns = ['Year_of_Release','Critic score', 'User score', 'NA Sales']
# Define the original Critic scores and NA Sales
critic_scores = df_test['Critic score']
na_sales = df_test['NA Sales']
user = df_test['User score']
# Create a list of 5 consecutive years for each unique value of Year_of_Release
year_sets = []
for year in df_test['Year_of_Release']:
year_set = list(range(year, year+6))
year_sets += [year_set[1:]]
# Flatten the year sets into a single list
year_of_release = []
for year in df_test['Year_of_Release']:
year_of_release += [year]
year_of_release += year_sets.pop(0)
# Repeat the original Critic scores and NA Sales for each set of random user scores
#critic_scores = list(critic_scores) * 6
#na_sales = list(na_sales) * 6
# Repeat the critic score and NA sales 5 times each
critic_scores = np.repeat(df_test['Critic score'], 6).tolist()
na_sales = np.repeat(df_test['NA Sales'], 6).tolist()
user = np.repeat(df_test['User score'],6).tolist()
# Create a dataframe with the correct length for each column
dfx = pd.DataFrame({'Year_of_Release': year_of_release,
'Critic score': critic_scores,
'User score': user,
'NA Sales': na_sales})
dfx
| Year_of_Release | Critic score | User score | NA Sales | |
|---|---|---|---|---|
| 0 | 2008 | 47.0 | 70.0 | 0.49 |
| 1 | 2009 | 47.0 | 70.0 | 0.49 |
| 2 | 2010 | 47.0 | 70.0 | 0.49 |
| 3 | 2011 | 47.0 | 70.0 | 0.49 |
| 4 | 2012 | 47.0 | 70.0 | 0.49 |
| 5 | 2013 | 47.0 | 70.0 | 0.49 |
| 6 | 2003 | 76.0 | 74.0 | 0.01 |
| 7 | 2004 | 76.0 | 74.0 | 0.01 |
| 8 | 2005 | 76.0 | 74.0 | 0.01 |
| 9 | 2006 | 76.0 | 74.0 | 0.01 |
| 10 | 2007 | 76.0 | 74.0 | 0.01 |
| 11 | 2008 | 76.0 | 74.0 | 0.01 |
| 12 | 2012 | 88.0 | 83.0 | 0.48 |
| 13 | 2013 | 88.0 | 83.0 | 0.48 |
| 14 | 2014 | 88.0 | 83.0 | 0.48 |
| 15 | 2015 | 88.0 | 83.0 | 0.48 |
| 16 | 2016 | 88.0 | 83.0 | 0.48 |
| 17 | 2017 | 88.0 | 83.0 | 0.48 |
| 18 | 2011 | 56.0 | 57.0 | 0.02 |
| 19 | 2012 | 56.0 | 57.0 | 0.02 |
| 20 | 2013 | 56.0 | 57.0 | 0.02 |
| 21 | 2014 | 56.0 | 57.0 | 0.02 |
| 22 | 2015 | 56.0 | 57.0 | 0.02 |
| 23 | 2016 | 56.0 | 57.0 | 0.02 |
| 24 | 2010 | 63.0 | 72.0 | 0.23 |
| 25 | 2011 | 63.0 | 72.0 | 0.23 |
| 26 | 2012 | 63.0 | 72.0 | 0.23 |
| 27 | 2013 | 63.0 | 72.0 | 0.23 |
| 28 | 2014 | 63.0 | 72.0 | 0.23 |
| 29 | 2015 | 63.0 | 72.0 | 0.23 |
import pandas as pd import random
df_test = dfx
critic_scores = df_test['Critic score'] na_sales = df_test['NA Sales']
value_set = [0,20,40,60,80] random_user_scores_sets = [valueset[:] for in range(5)]
user_scores = [] for score in df_test['User score']: user_scores += [score] user_scores += random_user_scores_sets.pop(0)
critic_scores = np.repeat(df_test['Critic score'], 6).tolist() na_sales = np.repeat(df_test['NA Sales'], 6).tolist()
dfx = pd.DataFrame({'Critic score': critic_scores, 'User score': user_scores, 'NA Sales': na_sales})
for test_set in df:
# Extract the features (Critic score, User score, NA Sales) from the test set
X_test = test_set.iloc[:, [0, 1, 2]].values
# Make predictions using the trained model
y_pred = reg_model.predict(X_test)
# Print the predicted values for the test set
print(y_pred)
y_pred = model.predict(dfx)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py:443: UserWarning: X has feature names, but LinearRegression was fitted without feature names warnings.warn(
y_pred_df = pd.DataFrame(y_pred)
dfx['PGS'] = y_pred_df
dfx['Year_of_Release'] = dfx['Year_of_Release'].astype('int')
import matplotlib.pyplot as plt
import seaborn as sns
# Split the original dataframe into 5 dataframes
split_df = np.array_split(dfx, 5)
# Convert the year column to integer
for df in split_df:
df['Year_of_Release'] = df['Year_of_Release'].astype(int)
# Plot the predicted global sales for each of the 5 dataframes
sns.set(style='white')
fig, ax = plt.subplots(figsize=(8, 6))
for i in range(5):
ax.plot(split_df[i]['Year_of_Release'], split_df[i]['PGS'], label=f'Test Set {i+1}')
ax.legend()
ax.set_xlabel('Release Years')
ax.set_ylabel('Predicted global sales')
plt.show()
split_df = np.array_split(dfx, 5)
ts1=split_df[0]
ts2=split_df[1]
ts3=split_df[2]
ts4=split_df[3]
ts5=split_df[4]
dfs = [ts1, ts2, ts3, ts4, ts5]
dfs
[ Year_of_Release Critic score User score NA Sales PGS
0 2016 80.333333 72.0 0.01 0.140353
1 2017 80.333333 72.0 0.01 0.150622
2 2018 80.333333 72.0 0.01 0.160890
3 2019 80.333333 72.0 0.01 0.171158
4 2020 80.333333 72.0 0.01 0.181427
5 2021 80.333333 72.0 0.01 0.191695,
Year_of_Release Critic score User score NA Sales PGS
6 2004 72.0 84.0 0.4 0.759638
7 2005 72.0 84.0 0.4 0.769906
8 2006 72.0 84.0 0.4 0.780175
9 2007 72.0 84.0 0.4 0.790443
10 2008 72.0 84.0 0.4 0.800711
11 2009 72.0 84.0 0.4 0.810980,
Year_of_Release Critic score User score NA Sales PGS
12 2003 81.0 81.0 0.3 0.573045
13 2004 81.0 81.0 0.3 0.583314
14 2005 81.0 81.0 0.3 0.593582
15 2006 81.0 81.0 0.3 0.603850
16 2007 81.0 81.0 0.3 0.614119
17 2008 81.0 81.0 0.3 0.624387,
Year_of_Release Critic score User score NA Sales PGS
18 2009 43.0 71.0 0.27 0.493269
19 2010 43.0 71.0 0.27 0.503537
20 2011 43.0 71.0 0.27 0.513806
21 2012 43.0 71.0 0.27 0.524074
22 2013 43.0 71.0 0.27 0.534342
23 2014 43.0 71.0 0.27 0.544611,
Year_of_Release Critic score User score NA Sales PGS
24 2008 70.0 63.0 0.07 0.147552
25 2009 70.0 63.0 0.07 0.157820
26 2010 70.0 63.0 0.07 0.168088
27 2011 70.0 63.0 0.07 0.178357
28 2012 70.0 63.0 0.07 0.188625
29 2013 70.0 63.0 0.07 0.198893]
import matplotlib.pyplot as plt
import seaborn as sns
# plot the predicted global sales for dfs[0]
sns.set(style='white')
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(dfs[0]['Year_of_Release'], dfs[0]['PGS'], label=f'Test Set 1')
ax.legend()
ax.set_xlabel('Release Years')
ax.set_ylabel('Predicted global sales')
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style='whitegrid')
# plot the data for ts1 using sns.pointplot
fig, ax = plt.subplots(figsize=(8, 6))
sns.pointplot(x='Year_of_Release', y='PGS', data=ts1, ax=ax)
# set the axis labels and title
ax.set_xlabel('Release Years')
ax.set_ylabel('Predicted global sales')
ax.set_title('Predicted Global Sales for Test Set 1')
# add the legend
plt.show()
#DUP
import matplotlib.pyplot as plt
# create a list of 5 DataFrames
sns.set(style='white')
# plot the predicted global sales for each of the 6 user sales in each DataFrame
#plt.figure(figsize=(12,6))
fig, ax = plt.subplots(figsize=(12, 6))
#fig, ax = plt.subplots(figsize=(16, 9))
for i in range(5):
ax.plot(dfs[i]['Year_of_Release'], dfs[i]['PGS'], label=f'Test Set {i+1}')
ax.legend()
ax.set_xlabel('Release Years')
ax.set_ylabel('Predicted global sales')
plt.show()